Overview of mutation burden and telomere length analysis

Summary

  1. Initial analysis of single nucleotide variant and indel burden
  2. SNV and indel burden plots
  3. Copy number and structural variant plots
  4. Telomere length calculations and normality testing
  5. Linear modelling

Open libraries

suppressMessages(library(GenomicRanges))
suppressMessages(library(dplyr))
suppressMessages(library(tidyr))
suppressMessages(library(ggplot2))
suppressMessages(library(BiocGenerics))
suppressMessages(library(knitr))
suppressMessages(library(stringr))
suppressMessages(library(ggtree))
suppressMessages(library(ape))
suppressMessages(library(seqinr))
suppressMessages(library(data.table))
suppressMessages(library(phytools))
suppressMessages(library(Rsamtools))
suppressMessages(library(MASS))
suppressMessages(library(lme4))
suppressMessages(library(stats))
suppressMessages(library(RColorBrewer))
suppressMessages(library(ggpubr))

1) Initial analysis of single nucleotide variant and indel burden

The initial analysis of single nucleotide variant and indel burdens is illustrated using the KX001 (29 year male) dataset. The input files for this part of the analysis are available on Mendeley Data (data post removal of samples that failed QC): 1) mats_and_params_KX001_4_01 2) tree_KX001_4_01_standard_rho01.tree 3) annotated_mut_set_KX001_4_01__standard_rho01 4) KX001_4_sensitivity Note these input files are also available for all the individuals analysed.

Set working directory and define files and paths
ID = "KX001" #Edit
Iteration = "KX001_4" #Edit
Run_ID = "KX001_4_01" #Edit
filtering_ID = "standard_rho01" #Edit
lim_snv = 1000 #Edit- used for ylim_snv on graphs of mutation burden
br_snv = c(200,400,600,800,1000)
lim_indel = 50 #Edit- used for ylim_snv on graphs of mutation burden
br_indel = c(10,20,30,40,50)
setwd = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/") #Ensure this directory exists and copy mats_and_params file there
mats_and_param_file = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/trees/vaf_cut/output/mats_and_params_", Run_ID)
tree_file_path = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/trees/vaf_cut/output/tree_", Run_ID,"_",filtering_ID, ".tree")
file_annot = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/trees/vaf_cut/output/annotated_mut_set_", Run_ID,"_",filtering_ID)
sensitivity_df = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/trees/vaf_cut/output/",Iteration,"_sensitivity")
Load “local drive” functions
function_files=list.files('~/Documents/Bioinformatics/CGP/Functions/', full.names = TRUE, pattern = ".R")
suppressMessages(sapply(function_files, source))
##         /Users/em16/Documents/Bioinformatics/CGP/Functions//filters_parallel_functions.R
## value   ?                                                                               
## visible FALSE                                                                           
##         /Users/em16/Documents/Bioinformatics/CGP/Functions//targeted_analysis_functions.R
## value   ?                                                                                
## visible FALSE                                                                            
##         /Users/em16/Documents/Bioinformatics/CGP/Functions//tree_functions.R
## value   ?                                                                   
## visible FALSE
Load input files
load(file_annot)
load(mats_and_param_file)
tree <- read.tree(tree_file_path)
sensitivity <- read.table(sensitivity_df, stringsAsFactors = F, header = T)
Save matrices of NV (mutant read number) and NR (total depth)
write.csv(filtered_muts$COMB_mats.tree.build$NR, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/mutation_burden/",Iteration,"_NR.csv"), row.names = T, quote=F)
write.csv(filtered_muts$COMB_mats.tree.build$NV, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/mutation_burden/",Iteration,"_NV.csv"), row.names = T, quote=F)
Assessing SNV burden per colony
SNVs_per_sample = colSums(filtered_muts$COMB_mats.tree.build$Genotype_bin[filtered_muts$COMB_mats.tree.build$mat$Mut_type == "SNV",] == 1)
mean(SNVs_per_sample); sd(SNVs_per_sample); range(SNVs_per_sample)
## [1] 427.4791
## [1] 65.34828
## [1] 228 632
Mutation_burden <- as.data.frame(colSums(filtered_muts$COMB_mats.tree.build$Genotype_bin[filtered_muts$COMB_mats.tree.build$mat$Mut_type == "SNV",] == 1))
Mutation_burden$Mean_depth <- (colSums(filtered_muts$COMB_mats.tree.build$NR)/nrow(filtered_muts$COMB_mats.tree.build$NR))
colnames(Mutation_burden)[1] <- "Number_mutations"
Mutation_burden$Sample <- rownames(Mutation_burden)
Summary_snvs <- as.data.frame(cbind(Mutation_burden[,c(3,1,2)]))
Assessing Indel burden per colony
INDELs_per_sample = colSums(filtered_muts$COMB_mats.tree.build$Genotype_bin[filtered_muts$COMB_mats.tree.build$mat$Mut_type == "INDEL",] == 1)
mean(INDELs_per_sample); sd(INDELs_per_sample); range(INDELs_per_sample)
## [1] 10.54545
## [1] 4.867938
## [1]  1 29
Indel_burden <- as.data.frame(colSums(filtered_muts$COMB_mats.tree.build$Genotype_bin[filtered_muts$COMB_mats.tree.build$mat$Mut_type == "INDEL",] == 1))
Indel_burden$Mean_depth <- (colSums(filtered_muts$COMB_mats.tree.build$NR)/nrow(filtered_muts$COMB_mats.tree.build$NR))
colnames(Indel_burden)[1] <- "Number_indels"
Indel_burden$Sample <- rownames(Indel_burden)
Summary_indels <- as.data.frame(cbind(Indel_burden[,c(3,1,2)]))
Model to correct mutation burden for sequencing depth of coverage

Summary_snvs: data.frame with one row per sample and the following columns:
Mean_depth: the average coverage per sample
Number_mutations: the number of mutations per sample

### Asymptotic model
# Sort data
sorted = sortedXyData(Summary_snvs$Mean_depth, Summary_snvs$Number_mutations)

# Run model
model.as = NLSstAsymptotic(sorted)
# Note: you will be alerted if the model does not converge (typically if data are not asymptotic or there is too little data)

# Look at results
model.as
##          b0          b1         lrc 
## -475.427582 1048.466358   -1.869659
Create model function
mymodel = function(x){
  b0 = model.as[1]
  b1 = model.as[2]
  lrc = model.as[3]
  b0 + b1*(1-exp(-exp(lrc) * x))  
}
Plot model
myrange=1:round(max(Summary_snvs$Mean_depth))
asp = data.frame(x=myrange, y=unlist(lapply(myrange, FUN=mymodel)))
suppressWarnings(ggplot(Summary_snvs) +
  ylim(1,lim_snv)+
  xlim(0,50)+
  geom_point(aes(x=Mean_depth,y=Number_mutations), data=Summary_snvs, pch=21) + 
  theme_bw() +
  scale_fill_brewer(type="qual", palette=3) + 
  ylab("# Mutations") +
  xlab("Coverage per colony")+
  geom_line(aes(x=x, y=y), data=asp)+
  labs(title = "SNV burden with asymptotic regression line"))
## Warning: Removed 3 row(s) containing missing values (geom_path).

Normalize mutation burden to a read depth of 30X
new_Number_mutations = function(rd, Number_mutations){
  Number_mutations + ( mymodel(30) - mymodel(rd)) ## can change depth here
}
Summary_snvs$Number_mutations_adj_as = apply(Summary_snvs, MARGIN=1, FUN=function(X)  new_Number_mutations(rd=as.numeric(X[which(colnames(Summary_snvs)=="Mean_depth")]), Number_mutations=as.numeric(X[which(colnames(Summary_snvs)=="Number_mutations")]) ) )
ggplot(Summary_snvs) +
  ylim(0,lim_snv)+
  xlim(0,50)+
  theme_bw() +
  geom_point(aes(Mean_depth,Number_mutations_adj_as), pch=21) + 
  ylab("# Mutations") +
  xlab("Coverage per colony")+
  labs(title ="SNV burden corrected for depth of sequencing")

ggplot(Summary_snvs) +
  ylim(0,lim_snv)+
  xlim(0,50)+
  theme_bw() +
  geom_smooth(aes(Mean_depth,Number_mutations_adj_as), pch=21) + 
  ylab("# Mutations") +
  xlab("Coverage per colony")+
labs(title ="SNV burden corrected for depth of sequencing")
## Warning: Ignoring unknown parameters: shape
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Save table of adjusted and unadjusted mutation burdens
write.table(Summary_snvs, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/mutation_burden/",Iteration,"_sub_dep_adj.txt"), sep="\t", col.names = T, row.names = F, quote=F)
Model to correct indel burden for sequencing depth of coverage
### Asymptotic model
# Sort data
sorted = sortedXyData(Summary_indels$Mean_depth, Summary_indels$Number_indels)

# Run model
model.as = NLSstAsymptotic(sorted)
# Note: you will be alerted if the model does not converge (typically if data are not asymptotic or there is too little data)
# If data are more linear then a linear regression is reasonable.

# Look at results
model.as
##        b0        b1       lrc 
## -21.30056  47.55293  -2.46786
Create model function
mymodel = function(x){
  b0 = model.as[1]
  b1 = model.as[2]
  lrc = model.as[3]
  b0 + b1*(1-exp(-exp(lrc) * x))  
}
Plot model
myrange=1:round(max(Summary_indels$Mean_depth))
asp = data.frame(x=myrange, y=unlist(lapply(myrange, FUN=mymodel)))
suppressWarnings(ggplot(Summary_indels) +
  ylim(1,lim_indel)+
  xlim(0,50)+
  geom_point(aes(x=Mean_depth,y=Number_indels), data=Summary_indels, pch=21) + 
  theme_bw() +
  scale_fill_brewer(type="qual", palette=3) + 
  ylab("# Indels") +
  xlab("Coverage per colony")+
  geom_line(aes(x=x, y=y), data=asp)+
  labs(title ="Indel burden with asymptotic regression line"))
## Warning: Removed 7 row(s) containing missing values (geom_path).

Normalize mutation burden to a read depth of 30X.
new_Number_indels = function(rd, Number_indels){
  Number_indels + ( mymodel(30) - mymodel(rd)) ## can change depth here
}
Summary_indels$Number_indels_adj_as = apply(Summary_indels, MARGIN=1, FUN=function(X)  new_Number_indels(rd=as.numeric(X[which(colnames(Summary_indels)=="Mean_depth")]), Number_indels=as.numeric(X[which(colnames(Summary_indels)=="Number_indels")]) ) )
ggplot(Summary_indels) +
  ylim(0,lim_indel)+
  xlim(0,50)+
  theme_bw() +
  geom_point(aes(Mean_depth,Number_indels_adj_as), pch=21) + 
  ylab("# Indels") +
  xlab("Coverage per colony")+
  labs(title ="Indel burden corrected for depth of sequencing")

ggplot(Summary_indels) +
  ylim(0,lim_indel)+
  xlim(0,50)+
  theme_bw() +
  geom_smooth(aes(Mean_depth,Number_indels_adj_as), pch=21) + 
  ylab("# Mutations") +
  xlab("Coverage per colony")+
  labs(title ="Indel burden corrected for depth of sequencing")
## Warning: Ignoring unknown parameters: shape
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Save table of unadjusted and adjusted indel burdens per sample
write.table(Summary_indels, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/mutation_burden/",Iteration,"_indel_dep_adj.txt"), sep="\t", col.names = T, row.names = F, quote=F)
Generate input for dnds
dnds_input <- filtered_muts$COMB_mats.tree.build$mat[,2:5]
dnds_input$SampleID <- ID
colnames(dnds_input) <- c("Chr","Pos","Ref","Alt","SampleID")
dnds_input <- dnds_input[,c("SampleID","Chr","Pos","Ref","Alt")]
write.table(dnds_input, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/dnds/",Iteration, "_dnds_all.txt"), sep="\t", col.names = T, row.names = F, quote=F)

create_dnds_files = function(mat, select_vector = NULL) {
  if(is.null(select_vector)) {dnds_file = mat[,2:5]} else {dnds_file = mat[select_vector,2:5]}
  dnds_file$SampleID <- ID
  colnames(dnds_file) <- c("Chr","Pos","Ref","Alt","SampleID")
  dnds_file <- dnds_file[,c("SampleID","Chr","Pos","Ref","Alt")]
  return(dnds_file)
}

shared_dnds_file = create_dnds_files(filtered_muts$COMB_mats.tree.build$mat, select_vector = filtered_muts$COMB_mats.tree.build$mat$mut_ref %in% rownames(filtered_muts$Genotype_shared_bin))

private_dnds_file = create_dnds_files(filtered_muts$COMB_mats.tree.build$mat, select_vector = !filtered_muts$COMB_mats.tree.build$mat$mut_ref %in% rownames(filtered_muts$Genotype_shared_bin))

write.table(shared_dnds_file, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/dnds/",Iteration, "_dnds_shared.txt"), sep="\t", col.names = T, row.names = F, quote=F)

write.table(private_dnds_file, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/dnds/",Iteration, "_dnds_private.txt"), sep="\t", col.names = T, row.names = F, quote=F)
Table of mutation calls per sample from tree
new_genotype_mat <- filtered_muts$COMB_mats.tree.build$Genotype_bin
#Set all the genotypes to 0 as a baseline
new_genotype_mat[,] <- 0
details <- filtered_muts$COMB_mats.tree.build$mat
 
#Now go through each of the mutations in turn and replace the 0's with 1's
for(i in tree$edge[,2]) {
  node=i
  info=get_edge_info(tree,details,node)
  samples=info$samples
  muts=details$mut_ref[info$idx]
  new_genotype_mat[muts,samples] <- 1
}

write.csv(new_genotype_mat, paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/mutation_burden/",Iteration, "_mutation_calls_tree.csv"),row.names = T, quote=F)
Creating vcf files per sample
new_genotype_mat <- cbind(filtered_muts$COMB_mats.tree.build$mat[,2:5],new_genotype_mat[,])

samples = colnames(new_genotype_mat[5:ncol(new_genotype_mat)])

for (sample in samples) {
#Write vcf files for VariantCaller analysis - separate out "All mutations", "Shared mutations", "Private mutations".
vcf_file = new_genotype_mat[(new_genotype_mat[,sample] >0),c(1:4)]
names(vcf_file) = c("#CHROM", "POS", "REF", "ALT")

#Add the required null columns
vcf_file$ID = vcf_file$QUAL = vcf_file$FILTER = vcf_file$INFO = "."

#Rearrange into correct order
vcf_file = vcf_file[,c(1,2,5,3,4,6,7,8)]

#Write files
write.table(vcf_file, sep = "\t", quote = FALSE, file = paste0("~/Documents/PhD/Sequencing_results/DNA_seq/",ID,"/",Iteration,"/subs/vaf_cut/hdp/",sample,"_mutations_all.vcf"), row.names = FALSE)
}
Generate input for signature analysis
all_muts <- filtered_muts$COMB_mats.tree.build$mat[filtered_muts$COMB_mats.tree.build$mat$Mut_type == "SNV",]
all_muts <- all_muts[,c(2:5,7)]
all_muts$SampleID <- paste0(ID, "_", all_muts$node)
all_muts <- all_muts[,c(1:4,6)]
colnames(all_muts) <- c("Chr","Pos","Ref","Alt","SampleID")
Generate mutation spectrum of all samples pooled
genomeFile = "/Users/em16/Documents/Bioinformatics/CGP/Reference_genomes/hs37d5.fa"

subs_only = all_muts
nrow(subs_only)
## [1] 160216
  colnames(subs_only) = c("chr", "pos", "ref", "mut")
  subs_only = subs_only[(subs_only$ref %in% c("A","C","G","T")) & (subs_only$mut %in% c("A","C","G","T")) & subs_only$chr %in% c(1:22,"X","Y"),]
  subs_only$trinuc_ref = as.vector(scanFa(genomeFile, GRanges(subs_only$chr, IRanges(subs_only$pos-1, subs_only$pos+1))))
  
  # 2. Annotating the mutation from the pyrimidine base
  ntcomp = c(T="A",G="C",C="G",A="T")
  subs_only$sub = paste(subs_only$ref,subs_only$mut,sep=">")
  subs_only$trinuc_ref_py = subs_only$trinuc_ref
  for (j in 1:nrow(subs_only)) {
    if (subs_only$ref[j] %in% c("A","G")) { # Purine base
      subs_only$sub[j] = paste(ntcomp[subs_only$ref[j]],ntcomp[subs_only$mut[j]],sep=">")
      subs_only$trinuc_ref_py[j] = paste(ntcomp[rev(strsplit(subs_only$trinuc_ref[j],split="")[[1]])],collapse="")
    }
  }
  
 # 3. Counting subs
  freqs = table(paste(subs_only$sub,paste(substr(subs_only$trinuc_ref_py,1,1),substr(subs_only$trinuc_ref_py,3,3),sep="-"),sep=","))
  sub_vec = c("C>A","C>G","C>T","T>A","T>C","T>G")
  ctx_vec = paste(rep(c("A","C","G","T"),each=4),rep(c("A","C","G","T"),times=4),sep="-")
  full_vec = paste(rep(sub_vec,each=16),rep(ctx_vec,times=6),sep=",")
  freqs_full = freqs[full_vec]
  freqs_full[is.na(freqs_full)] = 0
  names(freqs_full) = full_vec
  
  xstr = paste(substr(full_vec,5,5), substr(full_vec,1,1), substr(full_vec,7,7), sep="")
  
  colvec = rep(c("dodgerblue","black","red","grey70","olivedrab3","plum2"),each=16)
  y = freqs_full
  maxy = max(y)
  
  h=barplot(y, las=2, col=colvec, border=NA, ylim_snv=c(0,maxy*1.5), space=1, cex.names=0.6, names.arg=xstr, ylab="# SNVs")
## Warning in plot.window(xlim, ylim, log = log, ...): "ylim_snv" is not a
## graphical parameter
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "ylim_snv" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "ylim_snv" is not a graphical parameter
## Warning in axis(if (horiz) 1 else 2, cex.axis = cex.axis, ...): "ylim_snv" is
## not a graphical parameter
  mtext(side=4, text= ID)
  for (j in 1:length(sub_vec)) {
    xpos = h[c((j-1)*16+1,j*16)]
    rect(xpos[1]-0.5, maxy*1.2, xpos[2]+0.5, maxy*1.3, border=NA, col=colvec[j*16])
    text(x=mean(xpos), y=maxy*1.3, pos=3, label=sub_vec[j])
  } 

2) SNV and indel burden plots

This analysis is performed using a matrix of mutation burdens, indel burdens and telomere lengths per sample - available in mutation_telomere_analysis/input/Summary_cut.csv. The .csv file containing information on colony efficiency is available in mutation_telomere_analysis/input/Colony_efficiency_cut.csv)

Load colony efficiency file
col_efficiency <- read.csv("~/Documents/PhD/Lab_work/Colony_efficiency/Colony_efficiency_cut.csv", stringsAsFactors = F)
Load matrix of mutation burdens, indel burdens and telomere lengths per sample
summ_cut <- read.csv("~/Documents/PhD/Sequencing_results/DNA_seq/XX_summary/telomere_mutation/data/Summary_cut.csv",  stringsAsFactors = F)
summ_cut$donor_id <- factor(summ_cut$donor_id, levels = c("CB001", "CB002", "KX001", "KX002","SX001","AX001", "KX007","KX008","KX004","KX003"))
col_efficiency$donor_id <- factor(col_efficiency$donor_id, levels = c("CB001", "CB002", "KX001", "KX002","SX001","AX001", "KX007","KX008","KX004","KX003"))
Plot of colony forming efficiency
ggplot(col_efficiency)+
  theme_bw()+
  scale_fill_manual(values = brewer.pal(10,"Paired")[c(1:4,7,8,5,6,9,10)])+
  labs(y="Colony efficiency", x="Donor")+
  theme(text=element_text(size=12))+
  ylim(0,1.0)+
  geom_col(aes(group = donor_id, y= efficiency, x = donor_id, fill = donor_id), width = 0.9)+
  labs(title = "Colony Efficiency")

ggplot(summ_cut[summ_cut$platform == "hiseq" & summ_cut$cell_type == "HSC",])+
  theme_bw()+
  labs(x="Age (years)", y="Telomere  lenght (bp)")+
  xlim(-10,100)+
  ylim(0,25000)+
  theme(text=element_text(size=12))+
  guides(fill="none")+
  scale_color_manual(values = brewer.pal(10,"Paired")[c(1,3,4,7,8,9,10)])+
  geom_jitter(aes(group = donor_id, x = age, y =tel_length, col = donor_id), width = 1)+
geom_boxplot(aes(group = donor_id, x = age, y =tel_length), outlier.alpha = 0 )+
  labs(title = "Telomere length - all data included")

ggplot(summ_cut[summ_cut$platform == "hiseq" & summ_cut$cell_type == "HSC",])+
  theme_bw()+
  labs(x="Age (years)", y="Telomere length (bp)")+
  xlim(-10,100)+
  ylim(0,15000)+
  theme(text=element_text(size=12))+
  guides(fill="none")+
  scale_color_manual(values = brewer.pal(10,"Paired")[c(1,3,4,7,8,9,10)])+
  geom_jitter(aes(group = donor_id, x = age, y =tel_length, col = donor_id), width = 1)+
  geom_boxplot(aes(group = donor_id, x = age, y =tel_length), outlier.alpha = 0 )+
  labs(title = "Telomere length - y-axis truncated at 15000bp")

ggplot(summ_cut[summ_cut$cell_type == "HSC",])+
  theme_bw()+
  labs(x="Age (years)", y="Number SNVs")+
  xlim(-10,100)+
  ylim(0,2000)+
  theme(text=element_text(size=12))+
  guides(fill="none")+
  scale_color_manual(values = brewer.pal(10,"Paired")[c(1:4,7,8,5,6,9,10)])+
  geom_smooth(data = (summ_cut[summ_cut$cell_type == "HSC",]), method = "lm",aes(x = age, y =sub_adj), size = 0.5, colour = "grey", se = T)+
  geom_jitter(aes(group = donor_id, x = age, y =sub_adj, col = donor_id))+
  labs(title ="Number of single nucleotide variants")
## `geom_smooth()` using formula 'y ~ x'

ggplot(summ_cut[summ_cut$cell_type == "HSC",])+
  theme_bw()+
  labs(x="Age (years)", y="Number indels")+
  xlim(-10,100)+
  ylim(0,100)+
  theme(text=element_text(size=12))+
  geom_smooth(data = (summ_cut[summ_cut$cell_type == "HSC",]), method = "lm",aes(x = age, y =indel_adj), size = 0.5, colour = "grey", se = T)+
  guides(fill="none")+
  scale_color_manual(values = brewer.pal(10,"Paired")[c(1:4,7,8,5,6,9,10)])+
  geom_jitter(aes(group = donor_id, x = age, y =indel_adj, col = donor_id))+
  labs("Number of indels")
## `geom_smooth()` using formula 'y ~ x'

ggplot(subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004")))+
  theme_bw()+
  labs(x="Age (years)", y="Number indels")+
  xlim(-10,100)+
  ylim(0,100)+
  theme(text=element_text(size=12))+
geom_smooth(data = (subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004") & summ_cut$cell_type == "HSC")), method = "lm",aes(x = age, y =indel_adj), size = 0.5, colour = "red", se = T)+
  geom_smooth(data = (subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004") & summ_cut$cell_type == "Progenitor")), method = "lm",aes(x = age, y =indel_adj), size = 0.5, colour = "blue", se = T)+
  labs(title = "Number of indels in progenitors (blue) and HSC/MPPs (red)")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

ggplot(subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004")))+
  theme_bw()+
  labs(x="Age (years)", y="Number SNVs")+
  xlim(-10,100)+
  ylim(0,2000)+
  theme(text=element_text(size=12))+
  geom_smooth(data = (subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004") & summ_cut$cell_type == "HSC")), method = "lm",aes(x = age, y =sub_adj), size = 0.5, colour = "red", se = T)+
  geom_smooth(data = (subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004") & summ_cut$cell_type == "Progenitor")), method = "lm",aes(x = age, y =sub_adj), size = 0.5, colour = "blue", se = T)+
  labs(title = "Number of SNVs in progenitors (blue) and HSC/MPPs (red)")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

3) Copy number and structural variant plots

This analysis is performed using data files available in the mutation_telomere_analysis/input/ folder.

Load matrices of copy number / structural variant type per sample number for each donor
SV_summ <- read.csv("~/Documents/PhD/Sequencing_results/DNA_seq/XX_summary/telomere_mutation/data/SV_summ.csv", stringsAsFactors = F, row.names = 1)
CN_summ <- read.csv("~/Documents/PhD/Sequencing_results/DNA_seq/XX_summary/telomere_mutation/data/CN_summ.csv", stringsAsFactors = F, row.names = 1)
Cut CN / SV matrices to rows of interest
SV_summ <- as.matrix(SV_summ[6:9,],)
CN_summ <- as.matrix(CN_summ[6:9,])
barplot(SV_summ, beside = FALSE, names.arg = c("CB001","CB002","KX001","KX002","SX001","AX001","KX007","KX008","KX004","KX003"), col = brewer.pal(10,"PRGn")[c(2,4,8,10)], ylim = c(0,0.04), cex.names = 0.5, legend.text = T,main = "Number of structural variant events \nexpressed as a fraction of total sample number")

4) Telomere length calculations and normality testing

These analyses are performed using the Summary_cut.csv file loaded earlier as the summ_cut object.

Calculation mean sequencing depth
mean(summ_cut$mean_depth)
## [1] 14.13358
Calculation mean telomere length in cord blood
mean_tel_CB001 <- mean(summ_cut$tel_length[summ_cut$donor_id == "CB001"])
mean_tel_CB001
## [1] 6946.516
Tests of normality of telomere length distributions for each donor
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "CB001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
## 
##  Shapiro-Wilk normality test
## 
## data:  summ_cut$tel_length[summ_cut$donor_id == "CB001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.79028, p-value = 1.058e-15
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "KX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
## 
##  Shapiro-Wilk normality test
## 
## data:  summ_cut$tel_length[summ_cut$donor_id == "KX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.97028, p-value = 2.297e-07
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "KX002" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
## 
##  Shapiro-Wilk normality test
## 
## data:  summ_cut$tel_length[summ_cut$donor_id == "KX002" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.86813, p-value < 2.2e-16
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "SX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
## 
##  Shapiro-Wilk normality test
## 
## data:  summ_cut$tel_length[summ_cut$donor_id == "SX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.91815, p-value = 1.847e-08
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "AX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
## 
##  Shapiro-Wilk normality test
## 
## data:  summ_cut$tel_length[summ_cut$donor_id == "AX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.95725, p-value = 3.08e-05
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "KX004" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
## 
##  Shapiro-Wilk normality test
## 
## data:  summ_cut$tel_length[summ_cut$donor_id == "KX004" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.93659, p-value = 0.0009823
shapiro.test(summ_cut$tel_length[summ_cut$donor_id == "KX003" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])
## 
##  Shapiro-Wilk normality test
## 
## data:  summ_cut$tel_length[summ_cut$donor_id == "KX003" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"]
## W = 0.97375, p-value = 0.08012
Q-Q plots showing the correlation between telomere lengths and the normal distribution for each donor
ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "CB001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "CB001 Q-Q plot for telomere length")

ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "KX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX001 Q-Q plot for telomere length")

ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "KX002" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX002 Q-Q plot for telomere length")

ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "SX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "SX001 Q-Q plot for telomere length")

ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "AX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "AX001 Q-Q plot for telomere length")

ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "KX004" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])

ggqqplot(summ_cut$tel_length[summ_cut$donor_id == "KX003" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX003 Q-Q plot for telomere length")

Density plot to visualise distribution of telomere lengths
ggdensity(summ_cut$tel_length[summ_cut$donor_id == "CB001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "CB001", xlab = "Telomere length")

ggdensity(summ_cut$tel_length[summ_cut$donor_id == "KX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX001", xlab = "Telomere length")

ggdensity(summ_cut$tel_length[summ_cut$donor_id == "KX002" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX002", xlab = "Telomere length")

ggdensity(summ_cut$tel_length[summ_cut$donor_id == "SX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "SX001", xlab = "Telomere length")

ggdensity(summ_cut$tel_length[summ_cut$donor_id == "AX001" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "AX001", xlab = "Telomere length")

ggdensity(summ_cut$tel_length[summ_cut$donor_id == "KX004" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"])

ggdensity(summ_cut$tel_length[summ_cut$donor_id == "KX003" & summ_cut$cell_type == "HSC" & summ_cut$platform == "hiseq"], main = "KX003", xlab = "Telomere length")

Plot of percentage HSC/MPPs with outlying telomere lengths by age

Outliers determined by interquartile range (IQR) criterion as used by the function boxplot

CB001_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "CB001", ]
CB001_outliers <- boxplot.stats(CB001_norm$tel_length)$out
CB001 <- (length(CB001_outliers)/length(CB001_norm$tel_length))*100
CB001
## [1] 6.965174
KX001_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "KX001", ]
KX001_outliers <- boxplot.stats(KX001_norm$tel_length)$out
KX001 <- (length(KX001_outliers)/length(KX001_norm$tel_length))*100
KX001
## [1] 2.457002
KX002_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "KX002", ]
KX002_outliers <- boxplot.stats(KX002_norm$tel_length)$out
KX002 <- (length(KX002_outliers)/length(KX002_norm$tel_length))*100
KX002
## [1] 5.263158
SX001_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "SX001", ]
SX001_outliers <- boxplot.stats(SX001_norm$tel_length)$out
SX001 <- (length(SX001_outliers)/length(SX001_norm$tel_length))*100
SX001
## [1] 0.5586592
AX001_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "AX001", ]
AX001_outliers <- boxplot.stats(AX001_norm$tel_length)$out
AX001 <- (length(AX001_outliers)/length(AX001_norm$tel_length))*100
AX001
## [1] 1.685393
KX004_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "KX004", ]
KX004_outliers <- boxplot.stats(KX004_norm$tel_length)$out
KX004 <- (length(KX004_outliers)/length(KX004_norm$tel_length))*100
KX004
## [1] 0
KX003_norm <- summ_cut[summ_cut$platform == "hiseq" & summ_cut$donor_id == "KX003", ]
KX003_outliers <- boxplot.stats(KX003_norm$tel_length)$out
KX003 <- (length(KX003_outliers)/length(KX003_norm$tel_length))*100
KX003
## [1] 1.176471
all_out <- rbind(CB001, KX001, KX002, SX001, AX001, KX004, KX003)
age <- c(0,29,38,48,63,77,81)
all_out <- cbind(all_out, age)
colnames(all_out) <- c("outliers", "age")
all_out <- as.data.frame(all_out)
ggplot(data= all_out)+
theme_bw()+
  labs(x="Age (years)", y="Percentage HSC/MPPs with outlying telomere lengths")+
  xlim(-10,100)+
  scale_y_continuous(breaks = c(0,2,4,6,8))+
  theme(text=element_text(size=12))+
  geom_point(data = all_out, aes(x = age, y = all_out$outliers))+
 geom_smooth(data = all_out, method = lm, aes(x = age, y = outliers), se = FALSE)
## Warning: Use of `all_out$outliers` is discouraged. Use `outliers` instead.
## `geom_smooth()` using formula 'y ~ x'

outliers <- all_out$outliers
lm_out <- lm(outliers ~ age)
summary(lm_out)
## 
## Call:
## lm(formula = outliers ~ age)
## 
## Residuals:
##       1       2       3       4       5       6       7 
##  0.7734 -1.5566  1.9255 -2.0279  0.2255 -0.4084  1.0685 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  6.19181    1.22117   5.070  0.00387 **
## age         -0.07511    0.02227  -3.373  0.01983 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.562 on 5 degrees of freedom
## Multiple R-squared:  0.6947, Adjusted R-squared:  0.6336 
## F-statistic: 11.37 on 1 and 5 DF,  p-value: 0.01983

5) Linear models

These analyses are performed using the Summary_cut.csv file loaded earlier as the summ_cut object.

Linear model to assess the effect of age on the accumulation of SNVs in HSC/MPPs
age.mut <- lmer(sub_adj ~ age + (age | donor_id), data = summ_cut[summ_cut$cell_type == "HSC",], REML = F)
## boundary (singular) fit: see ?isSingular
age.mut
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: sub_adj ~ age + (age | donor_id)
##    Data: summ_cut[summ_cut$cell_type == "HSC", ]
##       AIC       BIC    logLik  deviance  df.resid 
##  37678.92  37715.67 -18833.46  37666.92      3368 
## Random effects:
##  Groups   Name        Std.Dev. Corr
##  donor_id (Intercept)  1.0661      
##           age          0.3811  1.00
##  Residual             63.9693      
## Number of obs: 3374, groups:  donor_id, 10
## Fixed Effects:
## (Intercept)          age  
##       54.57        16.83  
## optimizer (nloptwrap) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings
summary(age.mut)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: sub_adj ~ age + (age | donor_id)
##    Data: summ_cut[summ_cut$cell_type == "HSC", ]
## 
##      AIC      BIC   logLik deviance df.resid 
##  37678.9  37715.7 -18833.5  37666.9     3368 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.4623 -0.5092 -0.0154  0.4903  4.1832 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr
##  donor_id (Intercept)    1.1367  1.0661      
##           age            0.1452  0.3811  1.00
##  Residual             4092.0705 63.9693      
## Number of obs: 3374, groups:  donor_id, 10
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  54.5650     2.8988   18.82
## age          16.8318     0.1529  110.06
## 
## Correlation of Fixed Effects:
##     (Intr)
## age -0.346
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see ?isSingular
Linear model to assess the effect of age on the accumulation of indels in HSC/MPPs
age.indel <- lmer(indel_adj ~ age + (age | donor_id), data = summ_cut[summ_cut$cell_type == "HSC",], REML = F)
## boundary (singular) fit: see ?isSingular
age.indel
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: indel_adj ~ age + (age | donor_id)
##    Data: summ_cut[summ_cut$cell_type == "HSC", ]
##       AIC       BIC    logLik  deviance  df.resid 
## 15767.391 15804.135 -7877.696 15755.391      3368 
## Random effects:
##  Groups   Name        Std.Dev. Corr 
##  donor_id (Intercept) 0.54908       
##           age         0.01895  -1.00
##  Residual             2.48903       
## Number of obs: 3374, groups:  donor_id, 10
## Fixed Effects:
## (Intercept)          age  
##     -0.3301       0.1195  
## optimizer (nloptwrap) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings
summary(age.indel)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: indel_adj ~ age + (age | donor_id)
##    Data: summ_cut[summ_cut$cell_type == "HSC", ]
## 
##      AIC      BIC   logLik deviance df.resid 
##  15767.4  15804.1  -7877.7  15755.4     3368 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9666 -0.5561 -0.1074  0.5364  4.1523 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr 
##  donor_id (Intercept) 0.3014859 0.54908       
##           age         0.0003592 0.01895  -1.00
##  Residual             6.1952505 2.48903       
## Number of obs: 3374, groups:  donor_id, 10
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) -0.330075   0.248525  -1.328
## age          0.119526   0.006897  17.330
## 
## Correlation of Fixed Effects:
##     (Intr)
## age -0.922
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see ?isSingular
Linear model to assess effect of age on telomere length including only the ‘young adult’ individuals
age.tel <- lmer(tel_length ~ age + (age | donor_id), data = subset(summ_cut, summ_cut$platform == "hiseq" & !summ_cut$donor_id %in% c("CB001", "KX003") & summ_cut$cell_type == "HSC"), REML = F)
age.tel
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: tel_length ~ age + (age | donor_id)
##    Data: 
## subset(summ_cut, summ_cut$platform == "hiseq" & !summ_cut$donor_id %in%  
##     c("CB001", "KX003") & summ_cut$cell_type == "HSC")
##       AIC       BIC    logLik  deviance  df.resid 
## 19159.945 19190.580 -9573.973 19147.945      1213 
## Random effects:
##  Groups   Name        Std.Dev. Corr 
##  donor_id (Intercept) 628.2         
##           age          21.4    -1.00
##  Residual             618.5         
## Number of obs: 1219, groups:  donor_id, 5
## Fixed Effects:
## (Intercept)          age  
##     4543.20       -31.87  
## optimizer (nloptwrap) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings
summary(age.tel)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: tel_length ~ age + (age | donor_id)
##    Data: 
## subset(summ_cut, summ_cut$platform == "hiseq" & !summ_cut$donor_id %in%  
##     c("CB001", "KX003") & summ_cut$cell_type == "HSC")
## 
##      AIC      BIC   logLik deviance df.resid 
##  19159.9  19190.6  -9574.0  19147.9     1213 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.4215 -0.6373 -0.1048  0.4944  7.2819 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  donor_id (Intercept) 394629.0 628.2         
##           age            457.9  21.4    -1.00
##  Residual             382564.9 618.5         
## Number of obs: 1219, groups:  donor_id, 5
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  4543.20     316.74  14.344
## age           -31.87      10.70  -2.979
## 
## Correlation of Fixed Effects:
##     (Intr)
## age -0.995
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see ?isSingular
Linear model to assess the effect of age and cell type on the accumulation of SNVs
age.cell.type <- lmer(sub_adj ~ age + cell_type + (age | donor_id) + (cell_type | donor_id), data = subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004")), REML = F)
age.cell.type
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: sub_adj ~ age + cell_type + (age | donor_id) + (cell_type | donor_id)
##    Data: subset(summ_cut, summ_cut$donor_id %in% c("CB002", "SX001", "AX001",  
##     "KX004"))
##       AIC       BIC    logLik  deviance  df.resid 
## 17394.786 17448.336 -8687.393 17374.786      1554 
## Random effects:
##  Groups     Name                Std.Dev. Corr
##  donor_id   (Intercept)          6.14923     
##             age                  0.08256 1.00
##  donor_id.1 (Intercept)          0.45273     
##             cell_typeProgenitor 24.75478 0.06
##  Residual                       62.19801     
## Number of obs: 1564, groups:  donor_id, 4
## Fixed Effects:
##         (Intercept)                  age  cell_typeProgenitor  
##               49.76                16.64                28.72  
## optimizer (nloptwrap) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings
summary(age.cell.type)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: sub_adj ~ age + cell_type + (age | donor_id) + (cell_type | donor_id)
##    Data: subset(summ_cut, summ_cut$donor_id %in% c("CB002", "SX001", "AX001",  
##     "KX004"))
## 
##      AIC      BIC   logLik deviance df.resid 
##  17394.8  17448.3  -8687.4  17374.8     1554 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8958 -0.4078  0.0095  0.5054  3.2297 
## 
## Random effects:
##  Groups     Name                Variance  Std.Dev. Corr
##  donor_id   (Intercept)         3.781e+01  6.14923     
##             age                 6.817e-03  0.08256 1.00
##  donor_id.1 (Intercept)         2.050e-01  0.45273     
##             cell_typeProgenitor 6.128e+02 24.75478 0.06
##  Residual                       3.869e+03 62.19801     
## Number of obs: 1564, groups:  donor_id, 4
## 
## Fixed effects:
##                     Estimate Std. Error t value
## (Intercept)          49.7630     6.9987    7.11
## age                  16.6434     0.1555  107.00
## cell_typeProgenitor  28.7219    14.0138    2.05
## 
## Correlation of Fixed Effects:
##             (Intr) age   
## age         -0.713       
## cll_typPrgn -0.036  0.008
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see ?isSingular
Linear model to assess the effect of age and cell type on the accumulation of indels
age.cell.type.indel <- lmer(indel_adj ~ age + cell_type + (age | donor_id) + (cell_type | donor_id), data = subset(summ_cut, summ_cut$donor_id %in% c("CB002","SX001","AX001","KX004")), REML = F)
## boundary (singular) fit: see ?isSingular
age.cell.type.indel
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: indel_adj ~ age + cell_type + (age | donor_id) + (cell_type |  
##     donor_id)
##    Data: subset(summ_cut, summ_cut$donor_id %in% c("CB002", "SX001", "AX001",  
##     "KX004"))
##       AIC       BIC    logLik  deviance  df.resid 
##  7310.441  7363.991 -3645.220  7290.441      1554 
## Random effects:
##  Groups     Name                Std.Dev. Corr 
##  donor_id   (Intercept)         0.241736      
##             age                 0.008164 -1.00
##  donor_id.1 (Intercept)         0.255541      
##             cell_typeProgenitor 0.025839 1.00 
##  Residual                       2.481492      
## Number of obs: 1564, groups:  donor_id, 4
## Fixed Effects:
##         (Intercept)                  age  cell_typeProgenitor  
##             0.29289              0.11172              0.04258  
## optimizer (nloptwrap) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings
summary(age.cell.type.indel)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: indel_adj ~ age + cell_type + (age | donor_id) + (cell_type |  
##     donor_id)
##    Data: subset(summ_cut, summ_cut$donor_id %in% c("CB002", "SX001", "AX001",  
##     "KX004"))
## 
##      AIC      BIC   logLik deviance df.resid 
##   7310.4   7364.0  -3645.2   7290.4     1554 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9977 -0.5563 -0.1347  0.5276  4.1544 
## 
## Random effects:
##  Groups     Name                Variance  Std.Dev. Corr 
##  donor_id   (Intercept)         5.844e-02 0.241736      
##             age                 6.664e-05 0.008164 -1.00
##  donor_id.1 (Intercept)         6.530e-02 0.255541      
##             cell_typeProgenitor 6.677e-04 0.025839 1.00 
##  Residual                       6.158e+00 2.481492      
## Number of obs: 1564, groups:  donor_id, 4
## 
## Fixed effects:
##                     Estimate Std. Error t value
## (Intercept)         0.292890   0.359722   0.814
## age                 0.111723   0.006943  16.092
## cell_typeProgenitor 0.042579   0.189443   0.225
## 
## Correlation of Fixed Effects:
##             (Intr) age   
## age         -0.845       
## cll_typPrgn -0.070  0.036
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see ?isSingular